common R snippets




Vectors, lists, matrices, data frames

# Goals: A first look at R objects - vectors, lists, matrices, data frames.

# To make vectors "x" "y" "year" and "names"
x <- c(2,3,7,9)
y <- c(9,7,3,2)
year <- 1990:1993
names <- c("payal", "shraddha", "kritika", "itida")

# Accessing the 1st and last elements of y --
y[1]
y[length(y)]

# To make a list "person" --
person <- list(name="payal", x=2, y=9, year=1990)
person
# Accessing things inside a list --
person$name
person$x

# To make a matrix, pasting together the columns "year" "x" and "y"
# The verb cbind() stands for "column bind"
cbind(year, x, y)

# To make a "data frame", which is a list of vectors of the same length --
D <- data.frame(names, year, x, y)
nrow(D)
# Accessing one of these vectors
D$names
# Accessing the last element of this vector
D$names[nrow(D)]
# Or equally,
D$names[length(D$names)]

Sorting

# Goal: To do sorting.
#
# The approach here needs to be explained. If `i' is a vector of
# integers, then the data frame D[i,] picks up rows from D based
# on the values found in `i'.
#
# The order() function makes an integer vector which is a correct
# ordering for the purpose of sorting.

D <- data.frame(x=c(1,2,3,1), y=c(7,19,2,2))
D

# Sort on x
indexes <- order(D$x)
D[indexes,]

# Print out sorted dataset, sorted in reverse by y
D[rev(order(D$y)),]

Prices and returns

# Goal: Prices and returns # I like to multiply returns by 100 so as to have "units in percent". # In other words, I like it for 5% to be a value like 5 rather than 0.05. ## # I. Simulate random-walk prices, switch between prices & returns. ## # Simulate a time-series of PRICES drawn from a random walk # where one-period returns are i.i.d. N(mu, sigma^2). ranrw <- function(mu, sigma, p0=100, T=100) { cumprod(c(p0, 1 + (rnorm(n=T, mean=mu, sd=sigma)/100))) } prices2returns <- function(x) { 100*diff(log(x)) } returns2prices <- function(r, p0=100) { c(p0, p0 * exp(cumsum(r/100))) } cat("Simulate 25 points from a random walk starting at 1500 --\n") p <- ranrw(0.05, 1.4, p0=1500, T=25) # gives you a 25-long series, starting with a price of 1500, where # one-period returns are N(0.05,1.4^2) percent. print(p) cat("Convert to returns--\n") r <- prices2returns(p) print(r) cat("Go back from returns to prices --\n") goback <- returns2prices(r, 1500) print(goback) ## # II. Plenty of powerful things you can do with returns.... ## summary(r); sd(r) # summary statistics plot(density(r)) # kernel density plot acf(r) # Autocorrelation function ar(r) # Estimate a AIC-minimising AR model Box.test(r, lag=2, type="Ljung") # Box-Ljung test library(tseries) runs.test(factor(sign(r))) # Runs test bds.test(r) # BDS test. ## # III. Visualisation and the random walk ## # I want to obtain intuition into what kinds of price series can happen, # given a starting price, a mean return, and a given standard deviation. # This function simulates out 10000 days of a price time-series at a time, # and waits for you to click in the graph window, after which a second # series is painted, and so on. Make the graph window very big and # sit back and admire. # The point is to eyeball many series and thus obtain some intuition # into what the random walk does. visualisation <- function(p0, s, mu, labelstring) { N <- 10000 x <- (1:(N+1))/250 # Unit of years while (1) { plot(x, ranrw(mu, s, p0, N), ylab="Level", log="y", type="l", col="red", xlab="Time (years)", main=paste("40 years of a process much like", labelstring)) grid() z=locator(1) } } # Nifty -- assuming sigma of 1.4% a day and E(returns) of 13% a year visualisation(2600, 1.4, 13/250, "Nifty") # The numerical values here are used to think about what the INR/USD # exchange rate would have looked like if it started from 31.37, had # a mean depreciation of 5% per year, and had the daily vol of a floating # exchange rate like EUR/USD. visualisation(31.37, 0.7, 5/365, "INR/USD (NOT!) with daily sigma=0.7") # This is of course not like the INR/USD series in the real world - # which is neither a random walk nor does it have a vol of 0.7% a day. # The numerical values here are used to think about what the USD/EUR # exchange rate, starting with 1, having no drift, and having the observed # daily vol of 0.7. (This is about right). visualisation(1, 0.7, 0, "USD/EUR with no drift") ## # IV. A monte carlo experiment about the runs test ## # Measure the effectiveness of the runs test when faced with an # AR(1) process of length 100 with a coeff of 0.1 set.seed(101) one.ts <- function() {arima.sim(list(order = c(1,0,0), ar = 0.1), n=100)} table(replicate(1000, runs.test(factor(sign(one.ts())))$p.value < 0.05)) # We find that the runs test throws up a prob value of below 0.05 # for 91 out of 1000 experiments. # Wow! :-) # To understand this, you need to look up the man pages of: # set.seed, arima.sim, sign, factor, runs.test, replicate, table. # e.g. say ?replicate

Writing functions

# Goals: To write functions
#        To write functions that send back multiple objects.

# FIRST LEARN ABOUT LISTS --
X = list(height=5.4, weight=54)
print("Use default printing --")
print(X)
print("Accessing individual elements --")
cat("Your height is ", X$height, " and your weight is ", X$weight, "\n")

# FUNCTIONS --
square <- function(x) {
  return(x*x)
}
cat("The square of 3 is ", square(3), "\n")

# default value of the arg is set to 5.
cube <- function(x=5) {
  return(x*x*x);
}
cat("Calling cube with 2 : ", cube(2), "\n")    # will give 2^3
cat("Calling cube : ", cube(), "\n")            # will default to 5^3.

# LEARN ABOUT FUNCTIONS THAT RETURN MULTIPLE OBJECTS --
powers <- function(x) {
  parcel = list(x2=x*x, x3=x*x*x, x4=x*x*x*x);
  return(parcel);
}
X = powers(3);
print("Showing powers of 3 --"); print(X);

# WRITING THIS COMPACTLY (4 lines instead of 7)
powerful <- function(x) {
  return(list(x2=x*x, x3=x*x*x, x4=x*x*x*x));
}
print("Showing powers of 3 --"); print(powerful(3));

# In R, the last expression in a function is, by default, what is
# returned. So you could equally just say:
powerful <- function(x) {list(x2=x*x, x3=x*x*x, x4=x*x*x*x)}

Amazing R vector notation

# Goal: The amazing R vector notation.

cat("EXAMPLE 1: sin(x) for a vector --\n")
# Suppose you have a vector x --
x = c(0.1,0.6,1.0,1.5)

# The bad way --
n = length(x)
r = numeric(n)
for (i in 1:n) {
  r[i] = sin(x[i])
}
print(r)

# The good way -- don't use loops --
print(sin(x))

cat("\n\nEXAMPLE 2: Compute the mean of every column of a matrix --\n")
# Here's another example. It isn't really about R; it's about thinking in
# matrix notation. But still.
# Let me setup a matrix --
N=4; M=100;
r = matrix(runif(N*M), N, M)
# So I face an NxM matrix
#   [r11 r12 ... r1M]
#   [r21 r22 ... r2M]
#   [r31 r32 ... r3M]
# My goal: each column needs to be reduced to a mean.

# Method 1 uses loops:
mean1 = numeric(M)
for (i in 1:M) {
  mean1[i] = mean(r[,i])
}

# Alternatively, just say:
mean2 = rep(1/N, N) %*% r      # Pretty!

# The two answers are the same --
all.equal(mean1, mean2[,])
#
# As an aside, I should say that you can do this directly by using
# the colMeans() function. But the above is more about pedagogy rather
# than showing you how to get column means.

cat("\n\nEXAMPLE 3: Nelson-Siegel yield curve\n")
# Write this as if you're dealing with scalars --
# Nelson-Siegel function
nsz <- function(b0, b1, b2, tau, t) {
  tmp = t/tau
  tmp2 = exp(-tmp)
  return(b0 + ((b1+b2)*(1-tmp2)/(tmp)) - (b2*tmp2))
}
timepoints <- c(0.01,1:5)

# The bad way:
z <- numeric(length(timepoints))
for (i in 1:length(timepoints)) {
  z[i] <- nsz(14.084,-3.4107,0.0015,1.8832,timepoints[i])
}
print(z)

# The R way --
print(z <- nsz(14.084,-3.4107,0.0015,1.8832,timepoints))

cat("\n\nEXAMPLE 4: Making the NPV of a bond --\n")
# You know the bad way - sum over all cashflows, NPVing each.
# Now look at the R way.
C = rep(100, 6)
nsz(14.084,-3.4107,0.0015,1.8832,timepoints)    # Print interest rates
C/((1.05)^timepoints)                           # Print cashflows discounted @ 5%
C/((1 + (0.01*nsz(14.084,-3.4107,0.0015,1.8832,timepoints)))^timepoints) # Using NS instead of 5%

# NPV in two different ways --
C %*% (1 + (0.01*nsz(14.084,-3.4107,0.0015,1.8832,timepoints)))^-timepoints
sum(C * (1 + (0.01*nsz(14.084,-3.4107,0.0015,1.8832,timepoints)))^-timepoints)

# You can drop back to a flat yield curve at 5% easily --
sum(C * 1.05^-timepoints)

# Make a function for NPV --
npv <- function(C, timepoints, r) {
  return(sum(C * (1 + (0.01*r))^-timepoints))
}
npv(C, timepoints, 5)

# Bottom line: Here's how you make the NPV of a bond with cashflows C
# at timepoints timepoints when the zero curve is a Nelson-Siegel curve --
npv(C, timepoints, nsz(14.084,-3.4107,0.0015,1.8832,timepoints))
# Wow!

# ---------------------------------------------------------------------------
# Elegant vector notation is amazingly fast (in addition to being beautiful)
N <- 1e5
x <- runif(N, -3,3)
y <- runif(N)
# The bad way: build up the result with a loop --
method1 <- function(x,y) {
  tmp <- NULL
  for (i in 1:N) {
    if (x[i] < 0) tmp <- c(tmp, y[i])
  }
  tmp
}
# The good way: logical indexing --
method2 <- function(x,y) {
  y[x < 0]
}
s1 <- system.time(ans1 <- method1(x,y))
s2 <- system.time(ans2 <- method2(x,y))
all.equal(ans1, ans2)
s1/s2      # On my machine it's 2000x faster

Amazing R indexing notation

# Goal: To show amazing R indexing notation, and the use of is.na()

x <- c(2,7,9,2,NA,5)      # An example vector to play with.

# Give me elems 1 to 3 --
x[1:3]
# Give me all but elem 1 --
x[-1]

# Odd numbered elements --
indexes <- seq(1,6,2)
x[indexes]
# or, more compactly,
x[seq(1,6,2)]

# Access elements by specifying "on" / "off" through booleans --
require <- c(TRUE,TRUE,FALSE,FALSE,FALSE,FALSE)
x[require]

# Short vectors get reused! So, to get odd numbered elems --
x[c(TRUE,FALSE)]

# Locate missing data --
is.na(x)
# Replace missing data by 0 --
x[is.na(x)] <- 0
x

# Similar ideas work for matrices --
y <- matrix(c(2,7,9,2,NA,5), nrow=2)
y
# Make a matrix containing columns 1 and 3 --
y[,c(1,3)]
# Let us see what is.na(y) does --
is.na(y)
str(is.na(y))
# So is.na(y) gives back a matrix with the identical structure as that of y.
# Hence I can say
y[is.na(y)] <- -1
y

Making latex tabular objects

# Goal: To make latex tabular out of an R matrix

# Setup a nice R object:
m <- matrix(rnorm(8), nrow=2)
rownames(m) <- c("Age", "Weight")
colnames(m) <- c("Person1", "Person2", "Person3", "Person4")
m

# Translate it into a latex tabular:
library(xtable)
xtable(m, digits=rep(3,5))

# Production latex code that goes into a paper or a book --
print(xtable(m, caption="String", label="t:"),
      type="latex", file="blah.gen", table.placement="tp",
      latex.environments=c("center", "footnotesize"))
# Now you do \input{blah.gen} in your latex file.

# You're lazy, and want to use R to generate latex tables for you?
data <- cbind(c(7,9,11,2), c(2,4,19,21))
colnames(data) <- c("a","b")
rownames(data) <- c("x","y","z","a")
xtable(data)
# or you could do
data <- rbind(c(7,2), c(9,4), c(11,19), c(2,21))
# and the rest goes through identically.

Associative arrays / hashes

# Goal: Associative arrays (as in awk) or hashes (as in perl).
# Or, more generally, adventures in R addressing.

# Here's a plain R vector:
x <- c(2,3,7,9)
# But now I tag every elem with labels:
names(x) <- c("kal","sho","sad","aja")
# Associative array operations:
x["kal"] <- 12
# Pretty printing the entire associative array:
x

# This works for matrices too:
m <- matrix(runif(10), nrow=5)
rownames(m) <- c("violet","indigo","blue","green","yellow")
colnames(m) <- c("Asia","Africa")
# The full matrix --
m
# Or even better --
library(xtable)
xtable(m)

# Now address symbolically --
m[,"Africa"]
m["indigo",]
m["indigo","Africa"]

# The "in" operator, as in awk --
for (colour in c("yellow", "orange", "red")) {
  if (colour %in% rownames(m)) {
    cat("For Africa and ", colour, " we have ", m[colour, "Africa"], "\n")
  } else {
    cat("Colour ", colour, " does not exist in the hash.\n")
  }
}

# This works for data frames also --
D <- data.frame(m)
D
# Look closely at what happened --
str(D)       # The colours are the rownames(D).
# Operations --
D$Africa
D[,"Africa"]
D["yellow",]
# or
subset(D, rownames(D)=="yellow")
colnames(D) <- c("Antarctica","America")
D
D$America

Matrix notation (portfolio computations in financial economics)

# Goal: Utilise matrix notation # We use the problems of portfolio analysis as an example. # Prices of 4 firms to play with, at weekly frequency (for calendar 2004) -- p <- structure(c(300.403, 294.604, 291.038, 283.805, 270.773, 275.506, 292.271, 292.837, 284.872, 295.037, 280.939, 259.574, 250.608, 268.84, 266.507, 263.94, 273.173, 238.609, 230.677, 192.847, 219.078, 201.846, 210.279, 193.281, 186.748, 197.314, 202.813, 204.08, 226.044, 242.442, 261.274, 269.173, 256.05, 259.75, 243, 250.3, 263.45, 279.5, 289.55, 291.95, 302.1, 284.4, 283.5, 287.8, 298.3, 307.6, 307.65, 311.9, 327.7, 318.1, 333.6, 358.9, 385.1, 53.6, 51.95, 47.65, 44.8, 44.85, 44.3, 47.1, 44.2, 41.8, 41.9, 41, 35.3, 33.35, 35.6, 34.55, 35.55, 40.05, 35, 34.85, 28.95, 31, 29.25, 29.05, 28.95, 24.95, 26.15, 28.35, 29.4, 32.55, 37.2, 39.85, 40.8, 38.2, 40.35, 37.55, 39.4, 39.8, 43.25, 44.75, 47.25, 49.6, 47.6, 46.35, 49.4, 49.5, 50.05, 50.5, 51.85, 56.35, 54.15, 58, 60.7, 62.7, 293.687, 292.746, 283.222, 286.63, 259.774, 259.257, 270.898, 250.625, 242.401, 248.1, 244.942, 239.384, 237.926, 224.886, 243.959, 270.998, 265.557, 257.508, 258.266, 257.574, 251.917, 250.583, 250.783, 246.6, 252.475, 266.625, 263.85, 249.925, 262.9, 264.975, 273.425, 275.575, 267.2, 282.25, 284.25, 290.75, 295.625, 296.25, 291.375, 302.225, 318.95, 324.825, 320.55, 328.75, 344.05, 345.925, 356.5, 368.275, 374.825, 373.525, 378.325, 378.6, 374.4, 1416.7, 1455.15, 1380.97, 1365.31, 1303.2, 1389.64, 1344.05, 1266.29, 1265.61, 1312.17, 1259.25, 1297.3, 1327.38, 1250, 1328.03, 1347.46, 1326.79, 1286.54, 1304.84, 1272.44, 1227.53, 1264.44, 1304.34, 1277.65, 1316.12, 1370.97, 1423.35, 1382.5, 1477.75, 1455.15, 1553.5, 1526.8, 1479.85, 1546.8, 1565.3, 1606.6, 1654.05, 1689.7, 1613.95, 1703.25, 1708.05, 1786.75, 1779.75, 1906.35, 1976.6, 2027.2, 2057.85, 2029.6, 2051.35, 2033.4, 2089.1, 2065.2, 2091.7), .Dim = c(53, 4), .Dimnames = list(NULL, c("TISCO", "SAIL", "Wipro", "Infosys"))) # Shift from prices to returns -- r <- 100*diff(log(p)) # Historical expected returns -- colMeans(r) # Historical correlation matrix -- cor(r) # Historical covariance matrix -- S <- cov(r) S # Historical portfolio variance for a stated portfolio of 20%,20%,30%,30% -- w <- c(.2, .2, .3, .3) t(w) %*% S %*% w # The portfolio optimisation function in tseries -- library(tseries) optimised <- portfolio.optim(r) # This uses the historical facts from r optimised$pw # Weights optimised$pm # Expected return using these weights optimised$ps # Standard deviation of optimised port.

Handling missing data

# Goal:
#   A stock is traded on 2 exchanges.
#   Price data is missing at random on both exchanges owing to non-trading.
#   We want to make a single price time-series utilising information
#   from both exchanges. I.e., missing data for exchange 1 will
#   be replaced by information for exchange 2 (if observed).

# Let's create some example data for the problem.
e1 <- runif(15)                  # Prices on exchange 1
e2 <- e1 + 0.05*rnorm(15)        # Prices on exchange 2.
cbind(e1, e2)
# Blow away 5 points from each at random.
e1[sample(1:15, 5)] <- NA
e2[sample(1:15, 5)] <- NA
cbind(e1, e2)

# Now how do we reconstruct a time-series that tries to utilise both?
combined <- e1                   # Do use the more liquid exchange here.
missing <- is.na(combined)
combined[missing] <- e2[missing] # if it's also missing, I don't care.
cbind(e1, e2, combined)
# There you are.

Reading files

Reading a file with a few columns of numbers, and looking at what is there.

# Goal: To read in a simple data file, and look around its contents.

# Suppose you have a file "x.data" which looks like this:
#   1997,3.1,4
#   1998,7.2,19
#   1999,1.7,2
#   2000,1.1,13
# To read it in --
A <- read.table("x.data", sep=",", col.names=c("year", "my1", "my2"))
nrow(A)                      # Count the rows in A

summary(A$year)              # The column "year" in data frame A
                             # is accessed as A$year

A$newcol <- A$my1 + A$my2    # Makes a new column in A
newvar <- A$my1 - A$my2      # Makes a new R object "newvar"
A$my1 <- NULL                # Removes the column "my1"

# You might find these useful, to "look around" a dataset --
str(A)
summary(A)
library(Hmisc)               # This requires that you've installed the Hmisc package
contents(A)
describe(A)

Reading a file involving dates

# Goal: To read in a simple data file where date data is present.

# Suppose you have a file "x.data" which looks like this:
#   1997-07-04,3.1,4
#   1997-07-05,7.2,19
#   1997-07-07,1.7,2
#   1997-07-08,1.1,13
A <- read.table("x.data", sep=",", col.names=c("date", "my1", "my2"))
A$date <- as.Date(A$date, format="%Y-%m-%d")
# Say ?strptime to learn how to use "%" to specify
# other date formats. Two examples --
#   "15/12/2002"  needs "%d/%m/%Y"
#   "03 Jun 1997" needs "%d %b %Y"
# Actually, if you're using the ISO 8601 date format, i.e.
# "%Y-%m-%d", that's the default setting and you don't need to
# specify the format.

A$newcol <- A$my1 + A$my2    # Makes a new column in A
newvar <- A$my1 - A$my2      # Makes a new R object "newvar"
A$my1 <- NULL                # Delete the `my1' column
summary(A)                   # Makes summary statistics

Reading in a file made by CMIE's Business Beacon program

# Goal: To read in files produced by CMIE's "Business Beacon".
# This assumes you have made a file of MONTHLY data using CMIE's
# Business Beacon program. This contains 2 columns: M3 and M0.
A <- read.table(
                # Generic to all BB files --
                sep="|",                       # CMIE's .txt file is pipe delimited
                skip=3,                        # Skip the 1st 3 lines
                na.strings=c("N.A.","Err"),    # The ways they encode missing data
                # Specific to your immediate situation --
                file="bb_data.text",
                col.names=c("junk", "date", "M3", "M0")
                )
A$junk <- NULL                                 # Blow away this column
# Parse the CMIE-style "Mmm yy" date string that's used on monthly data
A$date <- as.Date(paste("1", as.character(A$date)), format="%d %b %Y")

Reading and writing both ascii files and binary files. Also, measure speed of these.

# Goal: Reading and writing ascii files, reading and writing binary files.
# And, to measure how much faster it is working with binary files.

# First manufacture a tall data frame:
# FYI -- runif(10) yields 10 U(0,1) random numbers.
B = data.frame(x1=runif(100000), x2=runif(100000), x3=runif(100000))
summary(B)

# Write out ascii file:
write.table(B, file = "/tmp/foo.csv", sep = ",", col.names = NA)

# Read in this resulting ascii file:
C = read.table("/tmp/foo.csv", header = TRUE, sep = ",", row.names=1)

# Write a binary file out of dataset C:
save(C, file="/tmp/foo.binary")
# Delete the dataset C:
rm(C)
# Restore from foo.binary:
load("/tmp/foo.binary")
summary(C)      # should yield the same results as summary(B) above.

# Now we time all these operations --
cat("Time creation of dataset:\n")
system.time({
  B = data.frame(x1=runif(100000), x2=runif(100000), x3=runif(100000))
})

cat("Time writing an ascii file out of dataset B:\n")
system.time(write.table(B, file = "/tmp/foo.csv", sep = ",", col.names = NA))

cat("Time reading an ascii file into dataset C:\n")
system.time({C = read.table("/tmp/foo.csv", header = TRUE, sep=",", row.names=1)})

cat("Time writing a binary file out of dataset C:\n")
system.time(save(C, file="/tmp/foo.binary"))

cat("Time reading a binary file + variablenames from /tmp/foo.binary:\n")
system.time(load("/tmp/foo.binary"))    # and then read it in from binary file

Sending an R data object to someone else, either in email or as a binary file.

# Goals: Lots of times, you need to give an R object to a friend,
# or embed data into an email.

# First I invent a little dataset --
set.seed(101)     # To make sure you get the same random numbers as me
# FYI -- runif(10) yields 10 U(0,1) random numbers.
A = data.frame(x1=runif(10), x2=runif(10), x3=runif(10))
# Look at it --
print(A)

# Writing to a binary file that can be transported
save(A, file="/tmp/my_data_file.rda")
# You can give this file to a friend
load("/tmp/my_data_file.rda")

# Plan B - you want pure ascii, which can be put into an email --
dput(A)
# This gives you a block of R code. Let me utilise that generated code
# to create a dataset named "B".
B <- structure(list(x1 = c(0.372198376338929, 0.0438248154241592,
0.709684018278494, 0.657690396532416, 0.249855723232031, 0.300054833060130,
0.584866625955328, 0.333467143354937, 0.622011963743716, 0.54582855431363),
x2 = c(0.879795730113983, 0.706874740775675, 0.731972594512627,
0.931634427979589, 0.455120594473556, 0.590319729177281, 0.820436094887555,
0.224118480458856, 0.411666829371825, 0.0386105608195066),
x3 = c(0.700711545301601, 0.956837461562827, 0.213352001970634,
0.661061500199139, 0.923318882007152, 0.795719761401415, 0.0712125543504953,
0.389407767681405, 0.406451216200367, 0.659355078125373)),
.Names = c("x1", "x2", "x3"),
row.names = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10"),
class = "data.frame")

# Verify that A and B are near-identical --
A - B
# or,
all.equal(A, B)

Make a "zoo" object, for handling time-series data.

# Goal: Make a time-series object using the "zoo" package

A <- data.frame(date=c("1995-01-01", "1995-01-02", "1995-01-03", "1995-01-06"),
                x=runif(4), y=runif(4))
A$date <- as.Date(A$date)    # yyyy-mm-dd is the default format
# So far there's nothing new - it's just a data frame. I have hand-
# constructed A but you could equally have obtained it using read.table().

# I want to make a zoo matrix out of the numerical columns of A
library(zoo)
B <- A
B$date <- NULL
z <- zoo(as.matrix(B), order.by=A$date)
rm(A, B)

# So now you are holding "z", a "zoo" object. You can do many cool
# things with it.
# See http://www.google.com/search?hl=en&q=zoo+quickref+achim&btnI=I%27m+Feeling+Lucky

# To drop down to a plain data matrix, say
C <- coredata(z)
rownames(C) <- as.character(time(z))
# Compare --
str(C)
str(z)
# The above is a tedious way of doing these things, designed to give you
# an insight into what is going on. If you just want to read a file
# into a zoo object, a very short path is something like:
#   z <- read.zoo(filename, format="%d %b %Y")

Exporting and importing data.

# Goal: All manner of import and export of datasets.

# Invent a dataset --
A <- data.frame(
                name=c("a","b","c"),
                ownership=c("Case 1","Case 1","Case 2"),
                listed.at=c("NSE",NA,"BSE"),    # Firm "b" is unlisted.
                is.listed=c(TRUE,FALSE,TRUE),   # R convention - boolean variables are named "is.something"
                x=c(2.2,3.3,4.4),
                date=as.Date(c("2004-04-04","2005-05-05","2006-06-06"))
                )

# To a spreadsheet through a CSV file --
write.table(A, file="demo.csv", sep = ",", col.names = NA, qmethod = "double")
B <- read.table("demo.csv", header = TRUE, sep = ",", row.names = 1)

# To R as a binary file --
save(A, file="demo.rda")
load("demo.rda")

# To the Open XML standard for transport for statistical data --
library(StatDataML)
writeSDML(A, "/tmp/demo.sdml")
B <- readSDML("/tmp/demo.sdml")

# To Stata --
library(foreign)
write.dta(A, "/tmp/demo.dta")
B <- read.dta("/tmp/demo.dta")
# foreign::write.foreign() also has a pathway to SAS and SPSS.

Reading .gz .bz2 files and URLs

# Goal: Special cases in reading files

# Reading in a .bz2 file --
read.table(bzfile("file.text.bz2"))    # Requires you have ./file.text.bz2

# Reading in a .gz file --
read.table(gzfile("file.text.gz"))     # Requires you have ./file.text.gz

# Reading from a pipe --
mydata <- read.table(pipe("awk -f filter.awk input.txt"))

# Reading from a URL --
read.table(url("http://www.mayin.org/ajayshah/A/demo.text"))
# This also works --
read.table("http://www.mayin.org/ajayshah/A/demo.text")

# Hmm, I couldn't think of how to read a .bz2 file from a URL. How about:
read.table(pipe("links -source http://www.mayin.org/ajayshah/A/demo.text.bz2 | bunzip2"))

# Reading binary files from a URL --
load(url("http://www.mayin.org/ajayshah/A/nifty_weekly_returns.rda"))

Directly reading Microsoft Excel files

# Goal: Reading in a Microsoft .xls file directly
library(gdata)
a <- read.xls("file.xls", sheet=2)     # This reads in the 2nd sheet

# Look at what the cat dragged in
str(a)

# If you have a date column, you'll want to fix it up like this:
a$date <- as.Date(as.character(a$X), format="%d-%b-%y")
a$X <- NULL

# Also see http://tolstoy.newcastle.edu.au/R/help/06/04/25674.html for
# another path.

Graphs

A grid of multiple pictures on one screen

# Goal: To make a panel of pictures.

par(mfrow=c(3,2))    # 3 rows, 2 columns.
# Now the next 6 pictures will be placed on these 6 regions. :-)

# Let me take some pains on the 1st
plot(density(runif(100)), lwd=2)
text(x=0, y=0.2, "100 uniforms")    # Showing you how to place text at will
abline(h=0, v=0)
# All these statements affect the 1st plot.

x=seq(0.01,1,0.01)
par(col="blue")      # default colour to blue.

# 2 --
plot(x, sin(x), type="l")
lines(x, cos(x), type="l", col="red")

# 3 --
plot(x, exp(x), type="l", col="green")
lines(x, log(x), type="l", col="orange")

# 4 --
plot(x, tan(x), type="l", lwd=3, col="yellow")

# 5 --
plot(x, exp(-x), lwd=2)
lines(x, exp(x), col="green", lwd=3)

# 6 --
plot(x, sin(x*x), type="l")
lines(x, sin(1/x), col="pink")

Making PDF files that go into books/papers

# Goal: Make pictures in PDF files that can be put into a paper.

xpts <- seq(-3,3,.05)

# Here is my suggested setup for a two-column picture --
pdf("demo2.pdf", width=5.6, height=2.8, bg="cadetblue1", pointsize=8)
par(mai=c(.6,.6,.2,.2))
plot(xpts, sin(xpts*xpts), type="l", lwd=2, col="cadetblue4",
     xlab="x", ylab="sin(x*x)")
grid(col="white", lty=1, lwd=.2)
abline(h=0, v=0)
dev.off()    # close the PDF device so the file is written out

# My suggested setup for a square one-column picture --
pdf("demo1.pdf", width=2.8, height=2.8, bg="cadetblue1", pointsize=8)
par(mai=c(.6,.6,.2,.2))
plot(xpts, sin(xpts*xpts), type="l", lwd=2, col="cadetblue4",
     xlab="x", ylab="sin(x*x)")
grid(col="white", lty=1, lwd=.2)
abline(h=0, v=0)
dev.off()

A histogram with tails in red

# Goal: A histogram with tails shown in red.

# This happened on the R mailing list on 7 May 2004.
# This is by Martin Maechler <maechler@stat.math.ethz.ch>, who was
# responding to a slightly imperfect version of this by
# "Guazzetti Stefano" <Stefano.Guazzetti@ausl.re.it>
x <- rnorm(1000)
hx <- hist(x, breaks=100, plot=FALSE)
plot(hx, col=ifelse(abs(hx$breaks) < 1.669, 4, 2))
# What is cool is that "col" is supplied a vector.

z=f(x,y) using contour lines and colours

# Goal: Visualisation of 3-dimensional (x,y,z) data using contour # plots and using colour to represent the 3rd dimension. # The specific situation is: On a grid of (x,y) points, you have # evaluated f(x,y). Now you want a graphical representation of # the resulting list of (x,y,z) points that you have. # Setup an interesting data matrix of (x,y,z) points: points <- structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.45, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.55, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.65, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 
70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0.998, 0.124, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0.998, 0.71, 0.068, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0.998, 0.898, 0.396, 0.058, 0.002, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0.998, 0.97, 0.726, 0.268, 0.056, 0.006, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0.996, 0.88, 0.546, 0.208, 0.054, 0.012, 0.002, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.998, 0.964, 0.776, 0.418, 0.18, 0.054, 0.014, 0.002, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.998, 0.906, 0.664, 0.342, 0.166, 0.056, 0.018, 0.006, 0.002, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.986, 0.862, 0.568, 0.29, 0.15, 0.056, 0.022, 0.008, 0.002, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.954, 0.778, 0.494, 0.26, 0.148, 0.056, 0.024, 0.012, 0.004, 0.002, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.906, 0.712, 0.43, 0.242, 0.144, 0.058, 0.028, 0.012, 0.006, 0.002, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.878, 0.642, 0.38, 0.222, 0.142, 0.066, 0.034, 0.014, 0.008, 0.004, 0.002, 0, 0, 0, 0, 0, 0, 0, 0, 0.846, 0.586, 0.348, 0.208, 0.136, 0.068, 0.034, 0.016, 0.012, 0.006, 0.004, 0.002, 0, 0, 0, 0, 0, 0, 0, 0.8, 0.538, 0.318, 0.204, 0.136, 0.07, 0.046, 0.024, 0.012, 0.008, 0.004, 0.002, 0.002, 0, 0, 0, 0, 0, 0, 0.762, 0.496, 0.294, 0.2, 0.138, 0.072, 0.05, 0.024, 0.014, 0.012, 0.006, 0.004, 0.002, 0.002, 0, 0, 0, 0, 0, 0.704, 0.472, 0.286, 0.198, 0.138, 0.074, 0.054, 0.028, 0.016, 0.012, 0.008, 0.006, 0.004, 0.002, 0.002, 0, 0, 0, 0, 0.668, 0.438, 0.276, 0.196, 0.138, 0.078, 0.054, 0.032, 0.024, 0.014, 0.012, 0.008, 0.004, 0.004, 0.002, 0.002, 0, 0, 0, 0.634, 0.412, 0.27, 0.194, 0.14, 0.086, 0.056, 0.032, 0.024, 0.016, 0.012, 0.01, 0.006, 0.004, 0.004, 0.002, 0.002, 0, 0, 0.604, 0.388, 0.26, 0.19, 0.144, 0.088, 0.058, 0.048, 0.026, 0.022, 0.014, 0.012, 0.008, 0.006, 0.004, 0.004, 0.002, 0.002, 0, 0.586, 0.376, 0.256, 0.19, 0.146, 0.094, 0.062, 0.052, 0.028, 0.024, 0.014, 0.012, 0.012, 0.008, 0.004, 0.004, 0.004, 0.002, 0.002, 0.566, 0.364, 0.254, 0.192, 0.148, 0.098, 0.064, 0.054, 0.032, 0.024, 0.022, 0.014, 0.012, 0.012, 0.008, 0.004, 0.004, 0.004, 0.002), .Dim = c(399, 3), .Dimnames = list(NULL, c("x", "y", "z"))) # Understand this object -- summary(points) # x is a grid from 0 to 1 # y is a grid from 20 to 200 # z is the interesting object which will be the 3rd dimension. # Solution using contourplot() from package 'lattice' library(lattice) d3 <- data.frame(points) contourplot(z ~ x+y, data=d3) ## or nicer contourplot(z ~ x+y, data=d3, cuts=20, region = TRUE) ## or using logit - transformed z values: contourplot(qlogis(z) ~ x+y, data=d3, pretty=TRUE, region = TRUE) # An interesting alternative is levelplot() levelplot(z ~ x+y, pretty=TRUE, contour=TRUE, data=d3) # There is a contour() function in R. 
# Even though it sounds obvious for the purpose, it is a bit hard to use.
# contour() wants 3 inputs: vectors of x and y values, and a matrix of
# z values, where the x values correspond to the rows of z, and the y
# values to the columns. A collection of points like `points' above
# needs to be turned into such a grid. It might sound odd, but contour(),
# image() and persp() have used this kind of input for the longest time.
#
# For irregular data, there's an interp() function in the akima package
# that can convert from irregular data into the grid format.
#
# The `points' object that I have above - a list of (x,y,z) points -
# fits directly into the mentality of lattice::contourplot() but not
# into the requirements of contour().

Show recessions using filled colour in a macro time-series plot

# Goal: Display of a macroeconomic time-series, with a filled colour
# bar showing a recession.

years <- 1950:2000
timeseries <- cumsum(c(100, runif(50)*5))
hilo <- range(timeseries)

plot(years, timeseries, type="l", lwd=3)
# A recession from 1960 to 1965 --
polygon(x=c(1960,1960, 1965,1965), y=c(hilo, rev(hilo)),
        density=NA, col="orange", border=NA)
lines(years, timeseries, type="l", lwd=3)    # paint again so line comes on top

# alternative method -- though not as good looking --
# library(plotrix)
# gradient.rect(1960, hilo[1], 1965, hilo[2],
#               reds=c(0,1), greens=c(0,0), blues=c(0,0),
#               gradient="y")

Plotting two series on one graph, one with a left y axis and another with a right y axis.

# Goal: Display two series on one plot, one with a left y axis
# and another with a right y axis.

y1 <- cumsum(rnorm(100))
y2 <- cumsum(rnorm(100, mean=0.2))
par(mai=c(.8, .8, .2, .8))
plot(1:100, y1, type="l", col="blue", xlab="X axis label", ylab="Left legend")
par(new=TRUE)
plot(1:100, y2, type="l", ann=FALSE, yaxt="n")
axis(4)
legend(x="topleft", bty="n",
       lty=c(1,1), col=c("blue","black"),
       legend=c("String 1 (left scale)", "String 2 (right scale)"))

Probability and statistics

Tables, joint and marginal distributions

# Goal: Joint distributions, marginal distributions, useful tables.

# First let me invent some fake data
set.seed(102)    # This yields a good illustration.
x <- sample(1:3, 15, replace=TRUE)
education <- factor(x, labels=c("None", "School", "College"))
x <- sample(1:2, 15, replace=TRUE)
gender <- factor(x, labels=c("Male", "Female"))
age <- runif(15, min=20, max=60)
D <- data.frame(age, gender, education)
rm(x, age, gender, education)
print(D)

# Table about education
table(D$education)
# Table about education and gender --
table(D$gender, D$education)
# Joint distribution of education and gender --
table(D$gender, D$education)/nrow(D)

# Add in the marginal distributions also
addmargins(table(D$gender, D$education))
addmargins(table(D$gender, D$education))/nrow(D)

# Generate a good LaTeX table out of it --
library(xtable)
xtable(addmargins(table(D$gender, D$education))/nrow(D),
       digits=c(0,2,2,2,2))
# You have to do | and \hline manually.

# Study age by gender --
by(D$age, D$gender, mean)
by(D$age, D$gender, sd)
by(D$age, D$gender, summary)

# Two-way table showing average age depending on education & gender
a <- matrix(by(D$age, list(D$gender, D$education), mean), nrow=2)
rownames(a) <- levels(D$gender)
colnames(a) <- levels(D$education)
print(a)
# or, of course,
print(xtable(a))

`Moving window' standard deviation

# Goal: To do `moving window volatility' of returns. library(zoo) # Some data to play with (Nifty on all fridays for calendar 2004) -- p <- structure(c(1946.05, 1971.9, 1900.65, 1847.55, 1809.75, 1833.65, 1913.6, 1852.65, 1800.3, 1867.7, 1812.2, 1725.1, 1747.5, 1841.1, 1853.55, 1868.95, 1892.45, 1796.1, 1804.45, 1582.4, 1560.2, 1508.75, 1521.1, 1508.45, 1491.2, 1488.5, 1537.5, 1553.2, 1558.8, 1601.6, 1632.3, 1633.4, 1607.2, 1590.35, 1609, 1634.1, 1668.75, 1733.65, 1722.5, 1775.15, 1820.2, 1795, 1779.75, 1786.9, 1852.3, 1872.95, 1872.35, 1901.05, 1996.2, 1969, 2012.1, 2062.7, 2080.5), index = structure(c(12419, 12426, 12433, 12440, 12447, 12454, 12461, 12468, 12475, 12482, 12489, 12496, 12503, 12510, 12517, 12524, 12531, 12538, 12545, 12552, 12559, 12566, 12573, 12580, 12587, 12594, 12601, 12608, 12615, 12622, 12629, 12636, 12643, 12650, 12657, 12664, 12671, 12678, 12685, 12692, 12699, 12706, 12713, 12720, 12727, 12734, 12741, 12748, 12755, 12762, 12769, 12776, 12783), class = "Date"), frequency = 0.142857142857143, class = c("zooreg", "zoo")) # Shift to returns -- r <- 100*diff(log(p)) head(r) summary(r) sd(r) # Compute the moving window vol -- vol <- sqrt(250) * rollapply(r, 20, sd, align = "right") # A pretty plot -- plot(vol, type="l", ylim=c(0,max(vol,na.rm=TRUE)), lwd=2, col="purple", xlab="2004", ylab=paste("Annualised sigma, 20-week window")) grid() legend(x="bottomleft", col=c("purple", "darkgreen"), lwd=c(2,2), bty="n", cex=0.8, legend=c("Annualised 20-week vol (left scale)", "Nifty (right scale)")) par(new=TRUE) plot(p, type="l", lwd=2, col="darkgreen", xaxt="n", yaxt="n", xlab=", ylab=") axis(4)

Requires a data file (demo.rda, containing a data frame `firms', as used below).

# Get the data in place --
load(file="demo.rda")
summary(firms)

# Look at it --
plot(density(log(firms$mktcap)))
plot(firms$mktcap, firms$spread, type="p", cex=.2, col="blue", log="xy",
     xlab="Market cap (Mln USD)", ylab="Bid/offer spread (bps)")
m = lm(log(spread) ~ log(mktcap), firms)
summary(m)

# Making deciles --
library(gtools)
library(gdata)
# for deciles (default=quartiles)
size.category = quantcut(firms$mktcap, q=seq(0, 1, 0.1), labels=F)
table(size.category)
means = aggregate(firms, list(size.category), mean)
print(data.frame(means$mktcap, means$spread))

# Make a picture combining the sample mean of spread (in each decile)
# with the weighted average sample mean of the spread (in each decile),
# where weights are proportional to size.
wtd.means = by(firms, size.category,
               function(piece) (sum(piece$mktcap*piece$spread)/sum(piece$mktcap)))
lines(means$mktcap, means$spread, type="b", lwd=2, col="green", pch=19)
lines(means$mktcap, wtd.means, type="b", lwd=2, col="red", pch=19)
legend(x=0.25, y=0.5, bty="n",
       col=c("blue", "green", "red"),
       lty=c(0, 1, 1), lwd=c(0,2,2), pch=c(0,19,19),
       legend=c("firm", "Mean spread in size deciles",
                "Size weighted mean spread in size deciles"))

# Within group standard deviations --
aggregate(firms, list(size.category), sd)

# Now I do quartiles by BOTH mktcap and spread.
size.quartiles = quantcut(firms$mktcap, labels=F)
spread.quartiles = quantcut(firms$spread, labels=F)
table(size.quartiles, spread.quartiles)
# Re-express everything as joint probabilities
table(size.quartiles, spread.quartiles)/nrow(firms)

# Compute cell means at every point in the joint table:
aggregate(firms, list(size.quartiles, spread.quartiles), mean)

# Make pretty two-way tables
aggregate.table(firms$mktcap, size.quartiles, spread.quartiles, nobs)
aggregate.table(firms$mktcap, size.quartiles, spread.quartiles, mean)
aggregate.table(firms$mktcap, size.quartiles, spread.quartiles, sd)
aggregate.table(firms$spread, size.quartiles, spread.quartiles, mean)
aggregate.table(firms$spread, size.quartiles, spread.quartiles, sd)

Distribution of sample mean and sample median

# Goal: Show the efficiency of the mean when compared with the median
# using a large simulation where both estimators are applied on
# a sample of U(0,1) uniformly distributed random numbers.

one.simulation <- function(N=100) {    # N defaults to 100 if not supplied
  x <- runif(N)
  return(c(mean(x), median(x)))
}

# Simulation --
results <- replicate(100000, one.simulation(20))    # Gives back a 2x100000 matrix

# Two kernel densities --
k1 <- density(results[1,])    # results[1,] is the 1st row
k2 <- density(results[2,])

# A pretty picture --
xrange <- range(k1$x, k2$x)
plot(k1$x, k1$y, xlim=xrange, type="l", xlab="Estimated value", ylab="")
grid()
lines(k2$x, k2$y, col="red")
abline(v=.5)
legend(x="topleft", bty="n",
       lty=c(1,1), col=c("black", "red"),
       legend=c("Mean", "Median"))

The bootstrap

The package boot has elegant and powerful support for bootstrapping. In order to use it, you have to repackage your estimation function as follows.

R has very elegant and abstract notation in array indexes. Suppose there is an integer vector OBS containing the elements 2, 3, 7, i.e. OBS <- c(2,3,7). Suppose x is a vector. Then the notation x[OBS] is a vector containing the elements x[2], x[3] and x[7]. This beautiful notation works for x as a dataset (data frame) also. Here are demos:

# For vectors --
> x = c(10,20,30,40,50)
> d = c(3,2,2)
> x[d]
[1] 30 20 20

# For data frames --
> D = data.frame(x=seq(10,50,10), y=seq(500,100,-100))
> t(D)
    1   2   3   4   5
x  10  20  30  40  50
y 500 400 300 200 100
> D[d,]
     x   y
3   30 300
2   20 400
2.1 20 400

Now for the key point: how does the R boot package work? The R package boot repeatedly calls your estimation function, and each time, the bootstrap sample is supplied using an integer vector of indexes like the one above. Here are two examples of how you would write estimation functions which are compatible with the package:

samplemean <- function(x, d) {
  return(mean(x[d]))
}

samplemedian <- function(x, d) {
  return(median(x[d]))
}

The estimation function (that you write) consumes data x and a vector of indices d. This function will be called many times, once for each bootstrap replication. Every time, the data `x' will be the same, and the bootstrap sample `d' will be different. At each call, the boot package will supply a fresh set of indices d. The notation x[d] allows us to make a brand-new vector (the bootstrap sample), which is given to mean() or median(). This reflects sampling with replacement from the original data vector.

Once you have written a function like this, here is how you would obtain bootstrap estimates of the standard deviation of the distribution of the median:

b = boot(x, samplemedian, R=1000)    # 1000 replications

The object `b' that is returned by boot() is interesting and useful. Say ?boot to learn about it. For example, after making b as shown above, you can say:

print(sd(b$t[,1]))

Here, I'm using the fact that b$t is a matrix containing 1000 rows which holds all the results of estimation. The 1st column in it is the only thing being estimated by samplemedian(), which is the sample median. The default plot() operator does nice things when fed with this object. Try it: say plot(b).

Dealing with data frames

Here is an example, which uses the bootstrap to report the ratio of two standard deviations:

library(boot)
sdratio <- function(D, d) {
  E = D[d,]
  return(sd(E$x)/sd(E$y))
}
x = runif(100)
y = 2*runif(100)
D = data.frame(x, y)
b = boot(D, sdratio, R=1000)
cat("Standard deviation of sdratio = ", sd(b$t[,1]), "\n")
ci = boot.ci(b, type="basic")
cat("95% CI from ", ci$basic[1,4], " - ", ci$basic[1,5], "\n")

Note the beautiful syntax E = D[d,], which gives you a data frame E using the rows out of data frame D that are specified by the integer vector d.

Sending more stuff to your estimation function

Many times, you want to send additional things to your estimation function. You're allowed to say whatever you want to boot(), after you have supplied the two mandatory things that he wants. Here's an example: the trimmed mean.

The R function mean() is general, and will also do a trimmed mean. If you say mean(x, 0.1), then it will remove the most extreme 10% of the data at both the top and the bottom, and report the mean of the middle 80%. Suppose you want to explore the sampling characteristics of the trimmed mean using boot(). You would write this:

trimmedmean <- function(x, d, trim=0) {
  return(mean(x[d], trim/length(x)))
}

Here, I'm defaulting trim to 0. And I'm allowing the caller to talk in the units of observations, not fractions of the data. So the user would say "5" to trim off the most extreme 5 observations at the top and the bottom. I convert that into a fraction before feeding it to mean(). Here's how you would call boot() using this:

b = boot(x, trimmedmean, R=1000, trim=5)

This sends the extra argument trim=5 to boot(), which sends it on to our trimmedmean() function.

Finding out more

The boot() function is very powerful. The above examples only scratch the surface. Among other things, it does things like the block bootstrap for time-series data, randomly censored data, etc. The manual can be accessed by saying:

library(boot)
?boot

but what you really need is the article "Resampling Methods in R: The boot package" by Angelo J. Canty, which appeared in the December 2002 issue of R News. Also see the web appendix to "An R and S-PLUS Companion to Applied Regression" by John Fox, and a tutorial by Patrick Burns.

Notes on boot()

# Goals: Do bootstrap inference, as an example, for a sample median.

library(boot)

samplemedian <- function(x, d) {       # d is a vector of integer indexes
  return(median(x[d]))                 # The genius is in the x[d] notation
}

data <- rnorm(50)                      # Generate a dataset with 50 obs
b <- boot(data, samplemedian, R=2000)  # 2000 bootstrap replications
cat("Sample median has a sigma of ", sd(b$t[,1]), "\n")
plot(b)

# Make a 99% confidence interval
boot.ci(b, conf=0.99, type="basic")

Doing MLE with your own likelihood function

Roll your own likelihood function with R.

This document assumes you know something about maximum likelihood estimation. It helps you get going with doing MLE in R. All through this, we will use the "ordinary least squares" (OLS) model (a.k.a. "linear regression" or "classical least squares", CLS) as the simplest possible example. For reference, the OLS log likelihood in the notation used below is: log L(beta, sigma^2) = -(n/2) log(2 pi) - (n/2) log(sigma^2) - e'e/(2 sigma^2), where e = y - X beta.

There are two powerful optimisers in R: optim() and nlminb(). This note only uses optim(). You should also explore nlminb().

You might find it convenient to snarf a tarfile of all the .R programs involved in this page.
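As a warm-up (not part of the original note), here is a minimal, self-contained sketch of handing one and the same negative log likelihood to both optim() and nlminb(). The helper nll(), the data x and the starting values are all hypothetical, invented just for this illustration.

# Sketch: the same negative log likelihood given to optim() and nlminb().
# nll() is a toy helper (normal sample, theta = c(mu, sigma)).
set.seed(1)
x <- rnorm(200, mean=4, sd=2)
nll <- function(theta, x) {
  if (theta[2] <= 0) return(Inf)      # keep sigma positive
  -sum(dnorm(x, mean=theta[1], sd=theta[2], log=TRUE))
}
optim(c(0, 1), nll, x=x)$par                          # optim(), default Nelder-Mead
nlminb(c(0, 1), nll, x=x, lower=c(-Inf, 1e-6))$par    # nlminb(), box constraint on sigma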

Writing the likelihood function

You have to write an R function which computes the likelihood function. As always in R, this can be done in several different ways.

One issue is that of restrictions upon parameters. When the search algorithm is running, it may stumble upon nonsensical values - such as a sigma below 0 - and you do need to think about this. One traditional way to deal with this is to "transform the parameter space". As an example, for all positive values of sigma, log(sigma) ranges from -infinity to +infinity. So it's safe to do an unconstrained search using log(sigma) as the free parameter. Here is the OLS likelihood, written in a few ways.

Confucius he said: when you write a likelihood function, do take the trouble of also writing its gradient (the vector of first derivatives). You don't absolutely need it, but it's highly recommended. In my toy experiment, this seems to be merely a question of speed - using the analytical gradient makes the MLE go faster. But the OLS likelihood is unique and simple; it is globally quasiconcave and has a clear top. There could not be a simpler task for a maximisation routine. In more complex situations, numerical derivatives are known to give more unstable searches, while analytical derivatives give more reliable answers.
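As one concrete illustration of the "transform the parameter space" idea (a sketch written for this page, not one of the original .R files), here is the OLS negative log likelihood with log(sigma) as the free parameter, so that any real value the optimiser tries maps back to a valid positive sigma. The function name ols.lf.logsigma and the assumed objects y and X are illustrative.

# Sketch: OLS negative log likelihood with theta = c(log(sigma), beta).
# Because exp() of any real number is positive, the search is unconstrained.
ols.lf.logsigma <- function(theta, y, X) {
  sigma <- exp(theta[1])      # back-transform: always > 0
  beta  <- theta[-1]
  e     <- y - X %*% beta
  n     <- nrow(X)
  -(-(n/2)*log(2*pi) - n*log(sigma) - sum(e^2)/(2*sigma^2))   # negative log likelihood
}
# Usage with an unconstrained Nelder-Mead search (assuming y and X exist):
#   optim(c(0, 0, 0), ols.lf.logsigma, y=y, X=X)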

A simulation setup

To use the other files on this page, you need to take my simulation setup file.
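The simulation setup file itself is not reproduced here. A minimal stand-in consistent with the numbers used below ("true theta = 2 4 6", i.e. error variance 2, intercept 4, slope 6, with 1000 observations) might look like this; the seed is arbitrary and will not reproduce the exact output shown later.

# A minimal stand-in for the simulation setup (assumption, not the original file):
set.seed(101)
theta.true <- c(2, 4, 6)            # c(sigma^2, intercept, slope)
X <- cbind(1, runif(1000))          # intercept column + one regressor
y <- X %*% theta.true[-1] + sqrt(theta.true[1]) * rnorm(1000)
summary(lm(y ~ X[, 2]))$coefficients    # OLS benchmark for comparison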

Comparing these alternatives

Now that I've written the OLS likelihood function in a few ways, it's natural to ask: Do they all give the same answer? And which is the fastest? I wrote a simple R program in order to learn these things. This gives the result:

True theta =  2 4 6
OLS theta =  2.004311 3.925572 6.188047
Kick the tyres --
                 lf1()  lf1() in logs    lf2()    lf3()
A weird theta 1864.956       1864.956 1864.956 1864.956
True theta    1766.418       1766.418 1766.418 1766.418
OLS theta     1765.589       1765.589 1765.589 1765.589
Cost (ms)        0.450          0.550    1.250    1.000
Derivatives -- first let me do numerically --
Derivative in sigma --  10.92756
Derivative in intercept --  -8.63967
Derivative in slope --  -11.82872
Analytical derivative in sigma --  10.92705
Analytical derivative in beta --  -8.642051 -11.82950

This shows us that of the 4 ways of writing it, ols.lf1() is the fastest, and that there is a fair match between my claimed analytical gradient and numerical derivatives.
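A sketch of that last check - comparing the analytical gradient against a numerical one - is shown below. It assumes the ols.lf1() and ols.gradient() functions from the "Notes on MLE" section further down, a (y, X) dataset already in place, and the numDeriv package; none of this is the author's original comparison program.

# Sketch: numerical vs analytical derivatives of the negative log likelihood.
library(numDeriv)
theta <- c(2, 4, 6)
grad(function(th) ols.lf1(th, y, X), theta)   # numerical gradient (numDeriv)
ols.gradient(theta, y, X)                     # analytical gradient; should match closely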

A minimal program which does the full MLE

Using this foundation, I can jump to a self-contained and minimal R program which does the full job. It gives this result:

True theta =  2 4 6
OLS theta =  2.004311 3.925572 6.188047

Gradient-free (constrained optimisation) --
$par
[1] 2.000304 3.925571 6.188048
$value
[1] 1765.588
$counts
function gradient
      18       18
$convergence
[1] 0
$message
[1] "CONVERGENCE: REL_REDUCTION_OF_F <= FACTR*EPSMCH"

Using the gradient (constrained optimisation) --
$par
[1] 2.000303 3.925571 6.188048
$value
[1] 1765.588
$counts
function gradient
      18       18
$convergence
[1] 0
$message
[1] "CONVERGENCE: REL_REDUCTION_OF_F <= FACTR*EPSMCH"

You say you want a covariance matrix?
MLE results --
          Coefficient  Std. Err.        t
Sigma        2.000303 0.08945629 22.36068
Intercept    3.925571 0.08792798 44.64530
X            6.188048 0.15377325 40.24138
Compare with the OLS results --
            Estimate Std. Error  t value      Pr(>|t|)
(Intercept) 3.925572 0.08801602 44.60065 7.912115e-240
X[, 2]      6.188047 0.15392722 40.20112 6.703474e-211

The file minimal.R also generates a picture (of how the log likelihood changes as sigma is perturbed).

Measurement about the full MLE

The R optim() function has many different paths to MLE. I wrote a simple R program in order to learn about these. This yields the result:

True theta =  2 4 6
OLS theta =  2.004311 3.925572 6.188047
                          Hit rate   Cost
L-BFGS-B, analytical           100   25.1
BFGS, analytical               100   33.1
Nelder-Mead, trans.            100   59.2
Nelder-Mead                    100   60.5
L-BFGS-B, numerical            100   61.2
BFGS, trans., numerical        100   68.5
BFGS, numerical                100   71.2
SANN                            99 4615.5
SANN, trans.                    96 4944.9

The algorithms compared above are:

L-BFGS-B, analytical. This uses L-BFGS-B, which is a variant of BFGS that allows "box" constraints (you can specify a permitted range for each parameter). This uses the ols.gradient() function to do analytical derivatives. It is the fastest (25.1 milliseconds on my machine) and works 100% of the time.

BFGS, analytical. This uses BFGS instead of L-BFGS-B, i.e. no constraints are permitted. Analytical derivatives are used.

Nelder-Mead, trans. Nelder-Mead is a derivative-free algorithm; it does not need you to write the gradient. This variant uses the log() transformation in order to ensure that sigma is positive.

Nelder-Mead. This is Nelder-Mead without the transformation.

L-BFGS-B, numerical. This is the same L-BFGS-B, but instead of giving him analytical derivatives, I leave optim() to fend for himself with numerical derivatives. A worse than doubling of cost!

BFGS, trans., numerical. This uses plain BFGS, with the log() transformation to ensure that sigma stays positive, but using numerical derivatives.

BFGS, numerical. This is plain BFGS, with no transformation to ensure a sane sigma, and using numerical derivatives.

SANN. This is a stochastic search algorithm based on simulated annealing. As you see, it failed for 1% of the runs. It is very costly. The attraction is that it might be more effective at finding global maxima and in "staying out of troublesome territory".

SANN, trans. This uses the log() transform for sigma and does the search using simulated annealing.
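For orientation, here is a sketch of how a few of the methods in that table are invoked, assuming the ols.lf1() and ols.gradient() functions plus y and X from the "Notes on MLE" section below; it is not the author's measurement program, and the untransformed runs can wander into sigma2 <= 0 territory, as discussed above.

# Sketch: the same likelihood under several optim() methods.
start <- c(1, 1, 1)
optim(start, ols.lf1, gr=ols.gradient, method="L-BFGS-B",
      lower=c(1e-6, -Inf, -Inf), y=y, X=X)                       # "L-BFGS-B, analytical"
optim(start, ols.lf1, gr=ols.gradient, method="BFGS", y=y, X=X)  # "BFGS, analytical"
optim(start, ols.lf1, method="Nelder-Mead", y=y, X=X)            # derivative-free, untransformed
optim(start, ols.lf1, method="SANN", y=y, X=X)                   # simulated annealing (very slow)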

Credits

I learned this with help from Douglas Bates and Spencer Graves, and by lurking on the r-help mailing list, thus learning from everyone who takes the trouble of writing there. Thanks!

Notes on MLE

# Goal: To do OLS by MLE.

# OLS likelihood function
# Note: I am going to write the LF using sigma2=sigma^2 and not sigma.
ols.lf1 <- function(theta, y, X) {
  beta <- theta[-1]
  sigma2 <- theta[1]
  if (sigma2 <= 0) return(NA)
  n <- nrow(X)
  e <- y - X%*%beta
  # t() = matrix transpose
  logl <- ((-n/2)*log(2*pi)) - ((n/2)*log(sigma2)) - ((t(e)%*%e)/(2*sigma2))
  return(-logl)      # since optim() does minimisation by default.
}

# Analytical derivatives
ols.gradient <- function(theta, y, X) {
  beta <- theta[-1]
  sigma2 <- theta[1]
  e <- y - X%*%beta
  n <- nrow(X)
  g <- numeric(length(theta))
  g[1] <- (-n/(2*sigma2)) + (t(e)%*%e)/(2*sigma2*sigma2)    # d logl / d sigma2
  g[-1] <- (t(X) %*% e)/sigma2                              # d logl / d beta
  return(-g)
}

X <- cbind(1, runif(1000))
theta.true <- c(2,4,6)      # error variance = 2, intercept = 4, slope = 6.
y <- X %*% theta.true[-1] + sqrt(theta.true[1]) * rnorm(1000)

# Estimation by OLS --
d <- summary(lm(y ~ X[,2]))
theta.ols <- c(sigma2 = d$sigma^2, d$coefficients[,1])
cat("OLS theta = ", theta.ols, "\n\n")

cat("\nGradient-free (constrained optimisation) --\n")
optim(c(1,1,1), method="L-BFGS-B", fn=ols.lf1,
      lower=c(1e-6,-Inf,-Inf), upper=rep(Inf,3), y=y, X=X)

cat("\nUsing the gradient (constrained optimisation) --\n")
optim(c(1,1,1), method="L-BFGS-B", fn=ols.lf1, gr=ols.gradient,
      lower=c(1e-6,-Inf,-Inf), upper=rep(Inf,3), y=y, X=X)

cat("\n\nYou say you want a covariance matrix?\n")
p <- optim(c(1,1,1), method="L-BFGS-B", fn=ols.lf1, gr=ols.gradient,
           lower=c(1e-6,-Inf,-Inf), upper=rep(Inf,3), hessian=TRUE, y=y, X=X)
inverted <- solve(p$hessian)
results <- cbind(p$par, sqrt(diag(inverted)), p$par/sqrt(diag(inverted)))
colnames(results) <- c("Coefficient", "Std. Err.", "t")
rownames(results) <- c("Sigma", "Intercept", "X")
cat("MLE results --\n")
print(results)

cat("Compare with the OLS results --\n")
d$coefficients

# Picture of how the loglikelihood changes if you perturb the sigma
theta <- theta.ols
delta.values <- seq(-1.5, 1.5, .01)
logl.values <- as.numeric(lapply(delta.values,
                                 function(x) {-ols.lf1(theta+c(x,0,0),y,X)}))
plot(sqrt(theta[1]+delta.values), logl.values, type="l", lwd=3, col="blue",
     xlab="Sigma", ylab="Log likelihood")
grid()

The strange Cauchy distribution

# Goals: Scare the hell out of children with the Cauchy distribution.

# A function which simulates N draws from one of two distributions,
# and returns the mean obtained thusly.
one.simulation <- function(N=100, distribution="normal") {
  if (distribution == "normal") {
    x <- rnorm(N)
  } else {
    x <- rcauchy(N)
  }
  mean(x)
}

k1 <- density(replicate(1000, one.simulation(20)))
k2 <- density(replicate(1000, one.simulation(20, distribution="cauchy")))
xrange <- range(k1$x, k2$x)
plot(k1$x, k1$y, xlim=xrange, type="l", xlab="Estimated value", ylab="")
grid()
lines(k2$x, k2$y, col="red")
abline(v=.5)
legend(x="topleft", bty="n",
       lty=c(1,1), col=c("black", "red"),
       legend=c("Mean of Normal", "Mean of Cauchy"))
# The distribution of the mean of normals collapses into a point;
# that of the cauchy does not.

# Here's more scary stuff --
for (i in 1:10) {
  cat("Sigma of distribution of 1000 draws from mean of normal - ",
      sd(replicate(1000, one.simulation(20))), "\n")
}
for (i in 1:10) {
  cat("Sigma of distribution of 1000 draws from mean of cauchy - ",
      sd(replicate(1000, one.simulation(20, distribution="cauchy"))), "\n")
}

# Exercise for the reader: Compare the distribution of the median of
# the Normal against the distribution of the median of the Cauchy.

An example of simulation-based inference

# Goal: An example of simulation-based inference. # This is in the context of testing for time-series dependence in # stock market returns data. # The code here does the idea of Kim, Nelson, Startz (1991). # We want to use the distribution of realworld returns data, without # needing assumptions about normality. # The null is lack of dependence (i.e. an efficient market). # So repeatedly, the data is permuted, and the sample ACF is computed. # This gives us the distribution of the ACF under H0: independence, but # while using the empirical distribution of the returns data. # Weekly returns on Nifty, 1/1/2002 to 31/12/2003, 104 weeks of data. r <- c(-0.70031182197603, 0.421690133064168, -1.20098072984689, 0.143402360644984, 3.81836537549516, 3.17055939373247, 0.305580301919228, 1.23853814691852, 0.81584795095706, -1.51865139747764, -2.71223626421522, -0.784836480094242, 1.09180041170998, 0.397649587762761, -4.11309534220923, -0.263912425099111, -0.0410144239805454, 1.75756212770972, -2.3335373897992, -2.19228764624217, -3.64578978183987, 1.92535789661354, 3.45782867883164, -2.15532607229374, -0.448039988298987, 1.50124793565896, -1.45871585874362, -2.13459863369767, -6.2128068251802, -1.94482987066289, 0.751294815735637, 1.78244982829590, 1.61567494389745, 1.53557708728931, -1.53557708728931, -0.322061470004265, -2.28394919698225, 0.70399304137414, -2.93580952607737, 2.38125098034425, 0.0617697039252185, -4.14482733720716, 2.04397528093754, 0.576400673606603, 3.43072725191913, 2.96465382864843, 2.89833358015583, 1.85387040058336, 1.52136515035952, -0.637268376944444, 1.75418926224609, -0.804391905851354, -0.861816058320475, 0.576902488444109, -2.84259880663331, -1.35375536139417, 1.49096529042234, -2.05404881010045, 2.86868849528146, -0.258270670200478, -4.4515881438687, -1.73055019137092, 3.04427015714648, -2.94928202352018, 1.62081315773994, -6.83117945164824, -0.962715713711582, -1.75875847071740, 1.50330330252721, -0.0479705789653728, 3.68968303215933, -0.535807567290103, 3.94034871061182, 3.85787174417738, 0.932185956989873, 4.08598654183674, 2.27343783689715, 1.13958830440017, 2.01737201171230, -1.88131458327554, 1.97596267156648, 2.79857144562001, 2.22470306481695, 2.03212951411427, 4.95626853448883, 3.40400972901396, 3.03840139165246, -1.89863129741417, -3.70832135042951, 4.78478922155396, 4.3973589590097, 4.9667050392987, 2.99775078737081, -4.12349101552438, 3.25638269809945, 2.29683376253966, -2.64772825878214, -0.630835277076258, 4.72528848505451, 1.87368447333380, 3.17543946162564, 4.58174427843208, 3.23625985632168, 2.29777651227296) # The 1st autocorrelation from the sample: acf(r, 1, plot=FALSE)$acf[2] # Obtain 1000 draws from the distribution of the 1st autocorrelation # under the null of independence: set.seed <- 101 simulated <- replicate(1000, acf(r[sample(1:104, replace=FALSE)], 1, plot=FALSE)$acf[2]) # At 95% -- quantile(simulated, probs=c(.025,.975)) # At 99% -- quantile(simulated, probs=c(.005,.995)) # So we can reject the null at 95% but not at 99%. # A pretty picture. plot(density(simulated), col="blue") abline(v=0) abline(v=quantile(simulated, probs=c(.025,.975)), lwd=2, col="purple") abline(v=acf(r, 1, plot=FALSE)$acf[2], lty=2, lwd=4, col="yellow")

Four standard operations with standard distributions

# Goal: Standard computations with well-studied distributions. # The normal distribution is named "norm". With this, we have: # Normal density dnorm(c(-1.96,0,1.96)) # Cumulative normal density pnorm(c(-1.96,0,1.96)) # Inverse of this qnorm(c(0.025,.5,.975)) pnorm(qnorm(c(0.025,.5,.975))) # 1000 random numbers from the normal distribution summary(rnorm(1000)) # Here are the same ideas, for the chi-squared distribution with 10 degrees # of freedom. # Chi-squared density dchisq(c(0,5,10), df=10) # Cumulative chi-squared density pchisq(c(0,5,10), df=10) # Inverse of this qchisq(c(0.025,.5,.975), df=10) # 1000 random numbers from the chi-squared distribution summary(rchisq(1000, df=10))
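The d/p/q/r naming convention works for every distribution shipped with R; for instance, the same four operations for the t distribution with 5 degrees of freedom:

# The same four operations for the t distribution with 5 degrees of freedom.
dt(c(-2,0,2), df=5)            # t density
pt(c(-2,0,2), df=5)            # Cumulative t density
qt(c(0.025,.5,.975), df=5)     # Inverse of this
summary(rt(1000, df=5))        # 1000 random numbers from the t distribution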

Two CDFs and a two-sample Kolmogorov-Smirnov test

# Goal: Given two vectors of data, # superpose their CDFs # and show the results of the two-sample Kolmogorov-Smirnov test # The function consumes two vectors x1 and x2. # You have to provide a pair of labels as `legendstrings'. # If you supply an xlab, it's used # If you specify log - e.g. log="x" - this is passed on to plot. # The remaining args that you specify are sent on into ks.test() two.cdfs.plot <- function(x1, x2, legendstrings, xlab="", log="", ...) { stopifnot(length(x1)>0, length(x2)>0, length(legendstrings)==2) hilo <- range(c(x1,x2)) par(mai=c(.8,.8,.2,.2)) plot(ecdf(x1), xlim=hilo, verticals=TRUE, cex=0, xlab=xlab, log=log, ylab="Cum. distribution", main="") grid() plot(ecdf(x2), add=TRUE, verticals=TRUE, cex=0, lwd=3) legend(x="bottomright", lwd=c(1,3), lty=1, bty="n", legend=legendstrings) k <- ks.test(x1,x2, ...) text(x=hilo[1], y=c(.9,.85), pos=4, cex=.8, labels=c( paste("KS test statistic: ", sprintf("%.3g", k$statistic)), paste("Prob value: ", sprintf("%.3g", k$p.value)) ) ) k } x1 <- rnorm(100, mean=7, sd=1) x2 <- rnorm(100, mean=9, sd=1) # Check error detection -- two.cdfs.plot(x1,x2) # Typical use -- two.cdfs.plot(x1, x2, c("X1","X2"), xlab="Height (metres)", log="x") # Send args into ks.test() -- two.cdfs.plot(x1, x2, c("X1","X2"), alternative="less")
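ks.test() also handles the one-sample case, comparing a vector against a theoretical CDF; a short sketch using the x1 simulated above:

# One-sample Kolmogorov-Smirnov test: is x1 compatible with N(7,1)?
ks.test(x1, "pnorm", mean=7, sd=1)
# And against a deliberately wrong null, for contrast --
ks.test(x1, "pnorm", mean=9, sd=1)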

Simulation to measure size and power of a test

# Goal: Simulation to study size and power in a simple problem. set.seed(101) # The data generating process: a simple uniform distribution with stated mean dgp <- function(N,mu) {runif(N)-0.5+mu} # Simulate one FIXED hypothesis test for H0:mu=0, given a true mu for a sample size N one.test <- function(N, truemu) { x <- dgp(N,truemu) muhat <- mean(x) s <- sd(x)/sqrt(N) # Under the null, the distribution of the mean has standard error s threshold <- 1.96*s (muhat < -threshold) || (muhat > threshold) } # Return of TRUE means reject the null # Do one experiment, where the fixed H0:mu=0 is run Nexperiments times with a sample size N. # We return only one number: the fraction of the time that H0 is rejected. experiment <- function(Nexperiments, N, truemu) { sum(replicate(Nexperiments, one.test(N, truemu)))/Nexperiments } # Measure the size of a test, i.e. rejections when H0 is true experiment(10000, 50, 0) # Measurement with sample size of 50, and true mu of 0. # Power study: I.e. Pr(rejection) when H0 is false # (one special case in here is when the H0 is actually true) muvalues <- seq(-.15,.15,.01) # When true mu < -0.15 and when true mu > 0.15, # the Pr(rejection) veers to 1 (full power) and it's not interesting. # First do this with sample size of 50 results <- NULL for (truth in muvalues) { results <- c(results, experiment(10000, 50, truth)) } par(mai=c(.8,.8,.2,.2)) plot(muvalues, results, type="l", lwd=2, ylim=c(0,1), xlab="True mu", ylab="Pr(Rejection of H0:mu=0)") abline(h=0.05, lty=2) # Now repeat this with sample size of 100 (should yield a higher power) results <- NULL for (truth in muvalues) { results <- c(results, experiment(10000, 100, truth)) } lines(muvalues, results, lwd=2, col="blue") legend(x=-0.15, y=.2, lwd=c(2,1,2), lty=c(1,2,1), cex=.8, col=c("black","black","blue"), bty="n", legend=c("N=50", "Size, 0.05", "N=100"))
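The hand-rolled 1.96*s rejection rule above can be cross-checked against t.test(); a sketch (the names one.test.t and experiment.t are invented here), which should give very similar size and power at these sample sizes:

# The same experiment, but letting t.test() decide the rejection.
one.test.t <- function(N, truemu) {
  t.test(dgp(N, truemu), mu=0)$p.value < 0.05
}
experiment.t <- function(Nexperiments, N, truemu) {
  mean(replicate(Nexperiments, one.test.t(N, truemu)))
}
experiment.t(10000, 50, 0)      # size
experiment.t(10000, 50, 0.1)    # power at true mu = 0.1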

Regression

Doing OLS

# Goal: Simulate a dataset from the OLS model and # obtain OLS estimates for it. x <- runif(100, 0, 10) # 100 draws from U(0,10) y <- 2 + 3*x + rnorm(100) # beta = [2, 3] and sigma = 1 # You want to just look at OLS results? summary(lm(y ~ x)) # Suppose x and y were packed together in a data frame -- D <- data.frame(x,y) summary(lm(y ~ x, D)) # Full and elaborate steps -- d <- lm(y ~ x) # Learn about this object by saying ?lm and str(d) # Compact model results -- print(d) # Pretty graphics for regression diagnostics -- par(mfrow=c(2,2)) plot(d) d <- summary(d) # Detailed model results -- print(d) # Learn about this object by saying ?summary.lm and by saying str(d) cat("OLS gave slope of ", d$coefficients[2,1], "and an error sigma of ", d$sigma, "\n") ## I need to drop down to a smaller dataset now -- x <- runif(10) y <- 2 + 3*x + rnorm(10) m <- lm(y ~ x) # Now R supplies a wide range of generic functions which extract # useful things out of the result of estimation of many kinds of models. residuals(m) fitted(m) AIC(m) AIC(m, k=log(10)) # SBC vcov(m) logLik(m)
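A few more generic extractors are worth knowing; a short sketch reusing the small model m from above (the values placed in newdata are arbitrary illustrations):

coef(m)                                     # Point estimates
confint(m)                                  # 95% confidence intervals
newdata <- data.frame(x=c(0.25, 0.75))
predict(m, newdata)                         # Point predictions
predict(m, newdata, interval="confidence")  # ...with confidence intervals
predict(m, newdata, interval="prediction")  # ...with prediction intervals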

Dummy variables in regression

# Goal: "Dummy variables" in regression. # Suppose you have this data: people = data.frame( age = c(21,62,54,49,52,38), education = c("college", "school", "none", "school", "college", "none"), education.code = c( 2, 1, 0, 1, 2, 0 ) ) # Here people$education is a string categorical variable and # people$education.code is the same thing, with a numerical coding system. people # Note the structure of the dataset -- str(people) # The strings supplied for `education' have been treated (correctly) as # a factor, but education.code is being treated as an integer and not as # a factor. # We want to do a dummy variable regression. Normally you would have: # 1 Chosen college as the omitted category # 2 Made a dummy for "none" named educationnone # 3 Made a dummy for "school" named educationschool # 4 Ran a regression like lm(age ~ educationnone + educationschool, people) # But this is R. Things are cool: lm(age ~ education, people) # ! :-) # When you feed him an explanatory variable like education, he does all # these steps automatically. (He chose college as the omitted category). # If you use an integer coding, then the obvious thing goes wrong -- lm(age ~ education.code, people) # because he's thinking that education.code is an integer explanatory # variable. So you need to: lm(age ~ factor(education.code), people) # (he choose a different omitted category) # Alternatively, fix up the dataset -- people$education.code <- factor(people$education.code) lm(age ~ education.code, people) # # Bottom line: # Once the dataset has categorical variables correctly represented as factors, i.e. as str(people) # doing OLS in R induces automatic generation of dummy variables while leaving one out: lm(age ~ education, people) lm(age ~ education.code, people) # But what if you want the X matrix? m <- lm(age ~ education, people) model.matrix(m) # This is the design matrix that went into the regression m.

Generate latex tables of OLS results

# Goal: To make a latex table with results of an OLS regression. # Get an OLS -- x1 = runif(100) x2 = runif(100, 0, 2) y = 2 + 3*x1 + 4*x2 + rnorm(100) m = lm(y ~ x1 + x2) # and print it out prettily -- library(xtable) # Bare -- xtable(m) xtable(anova(m)) # Better -- print.xtable(xtable(m, caption="My regression", label="t:mymodel", digits=c(0,3,2,2,3)), type="latex", file="xtable_demo_ols.tex", table.placement = "tp", latex.environments=c("center", "footnotesize")) print.xtable(xtable(anova(m), caption="ANOVA of my regression", label="t:anova_mymodel"), type="latex", file="xtable_demo_anova.tex", table.placement = "tp", latex.environments=c("center", "footnotesize")) # Read the documentation of xtable. It actually knows how to generate # pretty latex tables for a lot more R objects than just OLS results. # It can be a workhorse for making tabular out of matrices, and # can also generate HTML.
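xtable can emit HTML through the same print method; a sketch (the output filename is just an example):

# The same regression table as HTML rather than latex --
print(xtable(m, caption="My regression"), type="html",
      file="xtable_demo_ols.html")
# xtable() also works directly on matrices and data frames --
xtable(head(cbind(x1, x2, y)))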

`Least squares dummy variable' (LSDV) or `fixed effects' model

# Goals: Simulate a dataset from a "fixed effects" model, and # obtain "least squares dummy variable" (LSDV) estimates. # # We do this in the context of a familiar "earnings function" - # log earnings is quadratic in experience, with parallel shifts by # education category. # Create an education factor with 4 levels -- education <- factor(sample(1:4,1000, replace=TRUE), labels=c("none", "school", "college", "beyond")) # Simulate an experience variable with a plausible range -- experience <- 30*runif(1000) # experience from 0 to 30 years # Make the intercept vary by education category between 4 given values -- intercept <- c(0.5,1,1.5,2)[education] # Simulate the log earnings -- log.earnings <- intercept + 2*experience - 0.05*experience*experience + rnorm(1000) A <- data.frame(education, experience, e2=experience*experience, log.earnings) summary(A) # The OLS path to LSDV -- summary(lm(log.earnings ~ -1 + education + experience + e2, A))
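Since the data was simulated, the LSDV estimates can be checked against the truth; a quick sketch, refitting the model and saving it this time:

# The four education coefficients should be near the true intercepts
# (0.5, 1, 1.5, 2), and the experience terms near 2 and -0.05 --
m <- lm(log.earnings ~ -1 + education + experience + e2, A)
cbind(estimated=coef(m), truth=c(0.5, 1, 1.5, 2, 2, -0.05))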

Estimate beta of Sun Microsystems using data from Yahoo finance

Elaborate version

# Goal: Using data from Yahoo finance, estimate the beta of Sun Microsystems # for weekly returns. # This is the `elaborate version' (36 lines), also see terse version (16 lines) library(tseries) # I know that the yahoo symbol for the common stock of Sun Microsystems # is "SUNW" and for the S&P 500 index is "^GSPC". prices <- cbind(get.hist.quote("SUNW", quote="Adj", start="2003-01-01", retclass="zoo"), get.hist.quote("^GSPC", quote="Adj", start="2003-01-01", retclass="zoo")) colnames(prices) <- c("SUNW", "SP500") prices <- na.locf(prices) # Copy last traded price when NA # To make weekly returns, you must have this incantation: nextfri.Date <- function(x) 7 * ceiling(as.numeric(x - 1)/7) + as.Date(1) # and then say weekly.prices <- aggregate(prices, nextfri.Date,tail,1) # Now we can make weekly returns -- r <- 100*diff(log(weekly.prices)) # Now shift out of zoo to become an ordinary matrix -- r <- coredata(r) rj <- r[,1] rM <- r[,2] d <- lm(rj ~ rM) # Market model estimation. print(summary(d)) # Make a pretty picture big <- max(abs(c(rj, rM))) range <- c(-big, big) plot(rM, rj, xlim=range, ylim=range, xlab="S&P 500 weekly returns (%)", ylab="SUNW weekly returns (%)") grid() abline(h=0, v=0) lines(rM, d$fitted.values, col="blue")
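As a sanity check on the market-model regression, the slope is just the ratio of the covariance to the market variance; a two-line sketch using rj, rM and d from above:

# Beta two ways: OLS slope versus cov/var --
cov(rj, rM) / var(rM)
coef(d)["rM"]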

Terse version

# Goal : Terse version of estimating the beta of Sun Microsystems # using weekly returns and data from Yahoo finance. # By Gabor Grothendieck. library(tseries) getstock <- function(x) c(get.hist.quote(x, quote = "Adj", start = "2003-01-01", compress = "w")) r <- diff(log(cbind(sp500 = getstock("^gspc"), sunw = getstock("sunw")))) mm <- lm(sunw ~ ., r) print(summary(mm)) range <- range(r, -r) plot(r[,1], r[,2], xlim = range, ylim = range, xlab = "S&P 500 weekly returns (%)", ylab = "SUNW weekly returns (%)") grid() abline(mm, h = 0, v = 0, col = "blue")

Nonlinear regression

# Goal: To do nonlinear regression, in three ways # By just supplying the function to be fit, # By also supplying the analytical derivatives, and # By having him analytically differentiate the function to be fit. # # John Fox has a book "An R and S+ companion to applied regression" # (abbreviated CAR). # An appendix associated with this book, titled # "Nonlinear regression and NLS" # is up on the web, and I strongly recommend that you go read it. # # This file is essentially from there (I have made slight changes). # First take some data - from the CAR book -- library(car) data(US.pop) attach(US.pop) plot(year, population, type="l", col="blue") # So you see, we have a time-series of the US population. We want to # fit a nonlinear model to it. library(stats) # Contains nonlinear regression time <- 0:20 pop.mod <- nls(population ~ beta1/(1 + exp(beta2 + beta3*time)), start=list(beta1=350, beta2=4.5, beta3=-0.3), trace=TRUE) # You just write in the formula that you want to fit, and supply # starting values. "trace=TRUE" makes him show iterations go by. summary(pop.mod) # Add in predicted values into the plot lines(year, fitted.values(pop.mod), lwd=3, col="red") # Look at residuals plot(year, residuals(pop.mod), type="b") abline(h=0, lty=2) # Using analytical derivatives: model <- function(beta1, beta2, beta3, time) { m <- beta1/(1+exp(beta2+beta3*time)) term <- exp(beta2 + beta3*time) gradient <- cbind((1+term)^-1, -beta1*(1+term)^-2 * term, -beta1*(1+term)^-2 * term * time) attr(m, 'gradient') <- gradient return(m) } summary(nls(population ~ model(beta1, beta2, beta3, time), start=list(beta1=350, beta2=4.5, beta3=-0.3))) # Using analytical derivatives, using automatic differentiation (!!!): model <- deriv(~ beta1/(1 + exp(beta2+beta3*time)), # rhs of model c('beta1', 'beta2', 'beta3'), # parameter names function(beta1, beta2, beta3, time){} # arguments for result ) summary(nls(population ~ model(beta1, beta2, beta3, time), start=list(beta1=350, beta2=4.5, beta3=-0.3)))
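R also ships self-starting nonlinear models, which remove the need to supply starting values; a sketch using SSlogis(), which fits the same logistic family under the parameterisation Asym/(1 + exp((xmid - time)/scal)):

# A self-starting logistic fit: no start= list required.
pop.ss <- nls(population ~ SSlogis(time, Asym, xmid, scal))
summary(pop.ss)
# Asym here plays the role of beta1 above.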

Standard tests

# Goal: Some of the standard tests # A classical setting -- x <- runif(100, 0, 10) # 100 draws from U(0,10) y <- 2 + 3*x + rnorm(100) # beta = [2, 3] and sigma is 1 d <- lm(y ~ x) # OLS results -- summary(d) library(sandwich) library(lmtest) # Durbin-Watson test -- dwtest(d, alternative="two.sided") # Breusch-Pagan test -- bptest(d) # Heteroscedasticity and autocorrelation consistent (HAC) tests coeftest(d, vcov=kernHAC) # Transplant the HAC values back in -- library(xtable) sum.d <- summary(d) xtable(sum.d) sum.d$coefficients[1:2,1:4] <- coeftest(d, vcov=kernHAC)[1:2,1:4] xtable(sum.d)
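Newey-West standard errors are the other commonly used HAC estimator in the sandwich package; a short sketch (the lag of 2 is an arbitrary illustration):

# HAC inference with Newey-West standard errors --
coeftest(d, vcov=NeweyWest(d, lag=2))
# or let sandwich choose the bandwidth automatically --
coeftest(d, vcov=NeweyWest)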

Using orthogonal polynomials

# Goal: Experiment with fitting nonlinear functional forms in # OLS, using orthogonal polynomials to avoid difficulties with # near-singular design matrices that occur with ordinary polynomials. # Shriya Anand, Gabor Grothendieck, Ajay Shah, March 2006. # We will deal with noisy data from the d.g.p. y = sin(x) + e x <- seq(0, 2*pi, length.out=50) set.seed(101) y <- sin(x) + 0.3*rnorm(50) basicplot <- function(x, y, minx=0, maxx=3*pi, title="") { plot(x, y, xlim=c(minx,maxx), ylim=c(-2,2), main=title) lines(x, sin(x), col="blue", lty=2, lwd=2) abline(h=0, v=0) } x.outsample <- seq(0, 3*pi, length.out=100) # Severe multicollinearity with ordinary polynomials x2 <- x*x x3 <- x2*x x4 <- x3*x cor(cbind(x, x2, x3, x4)) # and a perfect design matrix using orthogonal polynomials m <- poly(x, 4) all.equal(cor(m), diag(4)) # Correlation matrix is I. par(mfrow=c(2,2)) # Ordinary polynomial regression -- p <- lm(y ~ x + I(x^2) + I(x^3) + I(x^4)) summary(p) basicplot(x, y, title="Polynomial, insample") # Data lines(x, fitted(p), col="red", lwd=3) # In-sample basicplot(x, y, title="Polynomial, out-of-sample") predictions.p <- predict(p, list(x = x.outsample)) # Out-of-sample lines(x.outsample, predictions.p, type="l", col="red", lwd=3) lines(x.outsample, sin(x.outsample), type="l", col="blue", lwd=2, lty=2) # As expected, polynomial fitting gives terrible results out of sample. # Now the IDENTICAL things using orthogonal polynomials d <- lm(y ~ poly(x, 4)) summary(d) basicplot(x, y, title="Orth. poly., insample") # Data lines(x, fitted(d), col="red", lwd=3) # In-sample basicplot(x, y, title="Orth. poly., out-of-sample") predictions.op <- predict(d, list(x = x.outsample)) # Out-of-sample lines(x.outsample, predictions.op, type="l", col="red", lwd=3) lines(x.outsample, sin(x.outsample), type="l", col="blue", lwd=2, lty=2) # predict(d) is magical! See ?SafePrediction # The story runs at two levels. First, when you do an OLS model, # predict()ion requires applying coefficients to an appropriate # X matrix. But one level deeper, the polynomial or orthogonal-polynomial # needs to be utilised for computing the X matrix based on the # supplied x.outsample data. # If you say p <- poly(x, n) # then you can say predict(p, new) where predict.poly() gets invoked. # And when you say predict(lm()), the full steps are worked out for # you automatically: predict.poly() is used to make an X matrix and # then prediction based on the regression results is done. all.equal(predictions.p, predictions.op) # Both paths are identical for this # (tame) problem.
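Natural splines from the splines package are another basis that cooperates with predict() in the same way (see ?SafePrediction); a sketch, keeping in mind that natural splines extrapolate linearly beyond the range of the data:

library(splines)
s <- lm(y ~ ns(x, df=4))
summary(s)
predictions.ns <- predict(s, list(x = x.outsample))
# The spline basis is rebuilt correctly for the new x values.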

A function that takes a model specification as an argument

# Goal: R syntax where model specification is an argument to a function. # Invent a dataset x <- runif(100); y <- runif(100); z <- 2 + 3*x + 4*y + rnorm(100) D <- data.frame(x=x, y=y, z=z) amodel <- function(modelstring) { summary(lm(modelstring, D)) } amodel(z ~ x) amodel(z ~ y)
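When the set of regressors is itself computed by the program, the specification can be built as a string and converted with as.formula(); a sketch (rhs and f are illustrative names):

rhs <- c("x", "y")
f <- as.formula(paste("z ~", paste(rhs, collapse=" + ")))
amodel(f)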

Time-series analysis

ARMA estimation, diagnostics, forecasting

# Goals: ARMA modeling - estimation, diagnostics, forecasting. # 0. SETUP DATA rawdata <- c(-0.21,-2.28,-2.71,2.26,-1.11,1.71,2.63,-0.45,-0.11,4.79,5.07,-2.24,6.46,3.82,4.29,-1.47,2.69,7.95,4.46,7.28,3.43,-3.19,-3.14,-1.25,-0.50,2.25,2.77,6.72,9.17,3.73,6.72,6.04,10.62,9.89,8.23,5.37,-0.10,1.40,1.60,3.40,3.80,3.60,4.90,9.60,18.20,20.60,15.20,27.00,15.42,13.31,11.22,12.77,12.43,15.83,11.44,12.32,12.10,12.02,14.41,13.54,11.36,12.97,10.00,7.20,8.74,3.92,8.73,2.19,3.85,1.48,2.28,2.98,4.21,3.85,6.52,8.16,5.36,8.58,7.00,10.57,7.12,7.95,7.05,3.84,4.93,4.30,5.44,3.77,4.71,3.18,0.00,5.25,4.27,5.14,3.53,4.54,4.70,7.40,4.80,6.20,7.29,7.30,8.38,3.83,8.07,4.88,8.17,8.25,6.46,5.96,5.88,5.03,4.99,5.87,6.78,7.43,3.61,4.29,2.97,2.35,2.49,1.56,2.65,2.49,2.85,1.89,3.05,2.27,2.91,3.94,2.34,3.14,4.11,4.12,4.53,7.11,6.17,6.25,7.03,4.13,6.15,6.73,6.99,5.86,4.19,6.38,6.68,6.58,5.75,7.51,6.22,8.22,7.45,8.00,8.29,8.05,8.91,6.83,7.33,8.52,8.62,9.80,10.63,7.70,8.91,7.50,5.88,9.82,8.44,10.92,11.67) # Make a R timeseries out of the rawdata: specify frequency & startdate gIIP <- ts(rawdata, frequency=12, start=c(1991,4)) print(gIIP) plot.ts(gIIP, type="l", col="blue", ylab="IIP Growth (%)", lwd=2, main="Full data") grid() # Based on this, I decide that 4/1995 is the start of the sensible period. gIIP <- window(gIIP, start=c(1995,4)) print(gIIP) plot.ts(gIIP, type="l", col="blue", ylab="IIP Growth (%)", lwd=2, main="Estimation subset") grid() # Descriptive statistics about gIIP mean(gIIP); sd(gIIP); summary(gIIP); plot(density(gIIP), col="blue", main="(Unconditional) Density of IIP growth") acf(gIIP) # 1. ARMA ESTIMATION m.ar2 <- arima(gIIP, order = c(2,0,0)) print(m.ar2) # Print it out # 2. ARMA DIAGNOSTICS tsdiag(m.ar2) # His pretty picture of diagnostics ## Time series structure in errors print(Box.test(m.ar2$residuals, lag=12, type="Ljung-Box")); ## Sniff for ARCH print(Box.test(m.ar2$residuals^2, lag=12, type="Ljung-Box")); ## Eyeball distribution of residuals plot(density(m.ar2$residuals), col="blue", xlim=c(-8,8), main=paste("Residuals of AR(2)")) # 3. FORECASTING ## Make a picture of the residuals plot.ts(m.ar2$residual, ylab="Innovations", col="blue", lwd=2) s <- sqrt(m.ar2$sigma2) abline(h=c(-s,s), lwd=2, col="lightGray") p <- predict(m.ar2, n.ahead = 12) # Make 12 predictions. print(p) ## Watch the forecastability decay away from fat values to 0. ## sd(x) is the naive sigma. p$se is the prediction se. gain <- 100*(1-p$se/sd(gIIP)) plot.ts(gain, main="Gain in forecast s.d.", ylab="Per cent", col="blue", lwd=2) ## Make a pretty picture that puts it all together ts.plot(gIIP, p$pred, p$pred-1.96*p$se, p$pred+1.96*p$se, gpars=list(lty=c(1,1,2,2), lwd=c(2,2,1,1), ylab="IIP growth (%)", col=c("blue","red", "red", "red"))) grid() abline(h=mean(gIIP), lty=2, lwd=2, col="lightGray") legend(x="bottomleft", cex=0.8, bty="n", lty=c(1,1,2,2), lwd=c(2,1,1,2), col=c("blue", "red", "red", "lightGray"), legend=c("IIP", "AR(2) forecasts", "95% C.I.", "Mean IIP growth"))
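The AR(2) order above was picked by hand; a small sketch of an AIC-based sweep over candidate ARMA orders (wrapped in try() because arima() can fail to converge for some orders):

# Order selection by AIC (lower is better) --
for (p in 0:3) for (q in 0:1) {
  m <- try(arima(gIIP, order=c(p,0,q)), silent=TRUE)
  if (!inherits(m, "try-error"))
    cat("ARMA(", p, ",", q, ")  AIC =", m$aic, "\n")
}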